/*
  author Sylvain Bertrand <sylvain.bertrand@gmail.com>
  Protected by linux GNU GPLv2
  Copyright 2012-2014
*/
#include <linux/pci.h>
#include <linux/cdev.h>
#include <asm/unaligned.h>
#include <linux/math64.h>

#include <alga/rng_mng.h>
#include <uapi/alga/pixel_fmts.h>
#include <alga/timing.h>
#include <alga/amd/atombios/atb.h>
#include <uapi/alga/amd/dce6/dce6.h>
#include <alga/amd/atombios/vm.h>
#include <alga/amd/atombios/cm.h>
#include <alga/amd/atombios/pp.h>
#include <alga/amd/atombios/vram_info.h>

#include "../mc.h"
#include "../rlc.h"
#include "../ih.h"
#include "../fence.h"
#include "../ring.h"
#include "../dmas.h"
#include "../ba.h"
#include "../cps.h"
#include "../gpu.h"
#include "../drv.h"

#include "../bif.h"

#include "../cm.h"
#include "../smc.h"
#include "../smc_tbls.h"

#include "../regs.h"

#include "ctx.h"
#include "private.h"
#include "pwrtune.h"
#include "fixp.h"

#ifdef CONFIG_ALGA_AMD_SI_DYN_PM_LOG
#define L(fmt,...) printk(KERN_INFO fmt "\n", ##__VA_ARGS__)
/*
 * Log the content of a smc cac cfg table, one field per line, between a
 * START and an END marker line.
 *
 * The 16 and 32 bits fields are read as unaligned big endian values, the
 * single byte fields are printed as stored.
 */
void smc_cac_cfg_tbl_dump(struct smc_cac_cfg_tbl *tbl)
{
	u8 t;
	u8 v;

	L("SMC_CAC_CFG_TBL START");

	/* lkge lut: first index bounded by TEMPS_N, second one by VOLTS_N */
	for (t = 0; t < SMC_LKGE_LUT_TEMPS_N; ++t) {
		for (v = 0; v < SMC_LKGE_LUT_VOLTS_N; ++v) {
			u16 cell;

			cell = get_unaligned_be16(&tbl->lkge_lut[t][v]);
			L("lkge_lut[%u][%u]=0x%04x",t,v,cell);
		}
	}

	/* 32 bits big endian fields */
	L("lkge_lut_v0=0x%08x",get_unaligned_be32(&tbl->lkge_lut_v0));
	L("lkge_lut_vstep=0x%08x",get_unaligned_be32(&tbl->lkge_lut_vstep));
	L("wnd_time=0x%08x",get_unaligned_be32(&tbl->wnd_time));
	L("r_ll=0x%08x",get_unaligned_be32(&tbl->r_ll));
	L("calculation_repeats=0x%08x",
			get_unaligned_be32(&tbl->calculation_repeats));
	L("l2_num_min_tdp=0x%08x",get_unaligned_be32(&tbl->l2_num_min_tdp));
	L("dc_cac=0x%08x",get_unaligned_be32(&tbl->dc_cac));

	/* single byte fields */
	L("lts_truncate_n=0x%02x",tbl->lts_truncate_n);
	L("shift_n=0x%02x",tbl->shift_n);
	L("pg_lkge_scale_log2=0x%02x",tbl->pg_lkge_scale_log2);
	L("cac_temp=0x%02x",tbl->cac_temp);

	/* trailing 32 bits big endian fields */
	L("lkge_lut_t0=0x%08x",get_unaligned_be32(&tbl->lkge_lut_t0));
	L("lkge_lut_tstep=0x%08x",get_unaligned_be32(&tbl->lkge_lut_tstep));

	L("SMC_CAC_CFG_TBL END");
}
#endif

/*
 * Scan the entries of ctx->atb_cac_lkge_tbl for the lowest and the highest
 * vddc_mv values.
 *
 * min: receives the lowest vddc_mv scaled by
 *      (100 - pwrtune lkge_lut_v0_percent) / 100
 * max: receives the highest vddc_mv, unmodified
 *
 * With an empty table, *max is 0 and *min is derived from 0xffff.
 */
static void vddc_min_max(struct ctx *ctx, u16 *min, u16 *max)
{
	struct pwrtune *pwrtune;
	u32 v0_loadline;
	u32 i;
	u16 lowest;
	u16 highest;

	highest = 0;
	lowest = ~0;

	for (i = 0; i < ctx->atb_cac_lkge_tbl.entries_n; ++i) {
		struct atb_cac_lkge *e;

		e = &ctx->atb_cac_lkge_tbl.entries[i];

		if (e->vddc_mv > highest)
			highest = e->vddc_mv;
		if (e->vddc_mv < lowest)
			lowest = e->vddc_mv;
	}

	*max = highest;

	/* lower the min vddc by the pwrtune lkge lut v0 percentage */
	pwrtune = pwrtune_get(ctx->dev);
	v0_loadline = lowest * (100 - pwrtune->lkge_lut_v0_percent) / 100;

	/* stored back truncated to 16 bits */
	*min = v0_loadline;
}

/*
 * Evaluate the leakage for one (volt, temp_s32) point using the fixp helpers
 * and the pwrtune lkge_coefs.
 *
 * Input scaling, once converted with s322fixp:
 *   lkge  = ctx->atb_cac_lkge / 100
 *   vddc  = volt / 1000
 *   temp  = temp_s32 / 1000
 *   temp_slope, temp_intercept, av, bv = coefficient / 100000000
 *   temp_ref is used unscaled
 *
 * Computation:
 *   f  = temp_slope * vddc + temp_intercept
 *   kt = fixp_exp(f * temp) / fixp_exp(f * temp_ref)
 *   kv = av * fixp_exp(bv * vddc)
 *   result = fixp2s32(lkge * kt * kv * vddc * 1000), cast to u32
 *
 * NOTE(review): volt and temp_s32 look like 1/1000 units (mV, milli-degrees)
 * given the divisions by 1000 -- confirm against the callers.
 */
static u32 lkge_from_volt_and_temp(struct ctx *ctx, u16 volt, s32 temp_s32)
{
	struct pwrtune *pwrtune = pwrtune_get(ctx->dev);
	struct lkge_coefs *coefs = &pwrtune->lkge_coefs;

	/* scaled inputs */
	s64 lkge = div64_s64(s322fixp((s32)ctx->atb_cac_lkge), 100);
	s64 vddc = div64_s64(s322fixp((s32)volt), 1000);
	s64 temp = div64_s64(s322fixp(temp_s32), 1000);

	/* scaled coefficients */
	s64 t_slope = div64_s64(s322fixp(coefs->temp_slope), 100000000);
	s64 t_intercept = div64_s64(s322fixp(coefs->temp_intercept),
								100000000);
	s64 av = div64_s64(s322fixp((s32)coefs->av), 100000000);
	s64 bv = div64_s64(s322fixp((s32)coefs->bv), 100000000);
	s64 t_ref = s322fixp(coefs->temp_ref);

	s64 t_factor;
	s64 kt_num;
	s64 kt_den;
	s64 kt;
	s64 kv;
	s64 lkge_w;

	/* temperature term, relative to the reference temperature */
	t_factor = fixp_mul(t_slope, vddc) + t_intercept;
	kt_num = fixp_exp(fixp_mul(t_factor, temp));
	kt_den = fixp_exp(fixp_mul(t_factor, t_ref));
	kt = fixp_div(kt_num, kt_den);

	/* voltage term */
	kv = fixp_mul(av, fixp_exp(fixp_mul(bv, vddc)));

	lkge_w = fixp_mul(lkge, kt);
	lkge_w = fixp_mul(lkge_w, kv);
	lkge_w = fixp_mul(lkge_w, vddc);

	/* scale by 1000 before dropping the fractional part */
	return (u32)fixp2s32(lkge_w * 1000);
}

#define TSTEP 4
#define T0 60
/*
 * Fill the row temp_idx of the lkge lut of tbl.
 *
 * The row temperature handed to lkge_from_volt_and_temp is
 * 1000 * (temp_idx * TSTEP + T0). The voltages go down from vddc_max by
 * vddc_step for each volt index, and the results are stored in reverse order:
 * the value computed for vddc_max lands in the last slot of the row.
 *
 * Each leakage value is divided by 4 and saturated to 0xffff before being
 * stored as an unaligned big endian 16 bits value.
 *
 * vddc_min is not used here.
 */
static void temp_process(struct ctx *ctx, u8 temp_idx,
				struct smc_cac_cfg_tbl *tbl, u16 vddc_min,
						u16 vddc_max, u16 vddc_step)
{
	s32 row_temp;
	u8 n;

	row_temp = (1000 * (temp_idx * TSTEP + T0));

	for (n = 0; n < SMC_LKGE_LUT_VOLTS_N; ++n) {
		u16 v;
		u32 raw; /* as returned by lkge_from_volt_and_temp */
		u32 scaled;
		u8 slot_idx;

		v = vddc_max - (vddc_step * n);
		raw = lkge_from_volt_and_temp(ctx, v, row_temp);

		/* divide by 4 then saturate to 16 bits */
		scaled = raw >> 2;
		if (scaled > 0xffff)
			scaled = 0xffff;

		/* highest voltage goes in the last slot */
		slot_idx = SMC_LKGE_LUT_VOLTS_N - 1 - n;
		put_unaligned_be16((u16)scaled,
					&tbl->lkge_lut[temp_idx][slot_idx]);
	}
}

/*
 * Fill the whole lkge lut of tbl: run temp_process once for each of the
 * SMC_LKGE_LUT_TEMPS_N rows, with the same voltage range and step.
 */
static void lkge_lut_init(struct ctx *ctx, struct smc_cac_cfg_tbl *tbl,
				u16 vddc_min, u16 vddc_max, u16 vddc_step)
{
	u8 row;

	row = 0;
	while (row < SMC_LKGE_LUT_TEMPS_N) {
		temp_process(ctx, row, tbl, vddc_min, vddc_max, vddc_step);
		++row;
	}
}

/*
 * Compute the value stored in the wnd_time field of the smc cac cfg table.
 *
 * The CCC_CAC_WND field of the pwrtune cac_wnd value is split in its upper
 * and lower 16 bits halves, which are multiplied together to get the window
 * size. The result is (window size * 100) / ctx->gpu_aux_clk.
 *
 * Returns 0 when ctx->gpu_aux_clk is 0, instead of dividing by zero.
 */
static u32 wnd_time_compute(struct ctx *ctx)
{
	u32 cac_wnd;
	u32 cac_wnd_sz;
	struct pwrtune *pwrtune;

	/* a division by zero would be fatal in kernel context */
	if (ctx->gpu_aux_clk == 0)
		return 0;

	pwrtune = pwrtune_get(ctx->dev);
	cac_wnd = get(CCC_CAC_WND, pwrtune->cac_wnd);

	cac_wnd_sz = ((cac_wnd & 0xffff0000) >> 16) * (cac_wnd & 0x0000ffff);

	return (cac_wnd_sz * 100) / ctx->gpu_aux_clk;
}

/*
 * Fill a smc cac cfg table from the driver context and the pwrtune data.
 *
 * Written fields: the lkge lut with its v0/vstep/t0/tstep descriptors,
 * wnd_time, r_ll, calculation_repeats, l2_num_min_tdp, lts_truncate_n,
 * shift_n, pg_lkge_scale_log2 and cac_temp. Multi-byte fields are stored as
 * unaligned big endian values.
 *
 * dc_cac is not written here.
 * NOTE(review): presumably expected to be already 0 in the table handed by
 * the caller -- confirm.
 */
void smc_cac_cfg_tbl_init(struct ctx *ctx, struct smc_cac_cfg_tbl *tbl)
{
	struct pwrtune *pwrtune;
	u16 v_lo;
	u16 v_hi;
	u16 v_step;
	u32 wnd_time;
	u32 r_ll;

	LOG("smc calculation accumulator configuration table init");

	/* voltage axis of the lkge lut */
	vddc_min_max(ctx, &v_lo, &v_hi);

	/* range divided by the volts count, rounded up */
	v_step = ((v_hi - v_lo) + (SMC_LKGE_LUT_VOLTS_N - 1))
							/ SMC_LKGE_LUT_VOLTS_N;
	/* realign the lowest voltage on the last step below the highest one */
	v_lo = v_hi - (v_step * (SMC_LKGE_LUT_VOLTS_N - 1));

	lkge_lut_init(ctx, tbl, v_lo, v_hi, v_step);
	put_unaligned_be32(v_lo, &tbl->lkge_lut_v0);
	put_unaligned_be32(v_step, &tbl->lkge_lut_vstep);

	/* temperature axis of the lkge lut */
	put_unaligned_be32(T0, &tbl->lkge_lut_t0);
	put_unaligned_be32(TSTEP, &tbl->lkge_lut_tstep);

	wnd_time = wnd_time_compute(ctx);
	put_unaligned_be32(wnd_time, &tbl->wnd_time);

	r_ll = ((u32)ctx->atb_load_line_slope << SMC_LOAD_LINE_SLOPE_SCALE_R)
									/ 100;
	put_unaligned_be32(r_ll, &tbl->r_ll);

	put_unaligned_be32(2, &tbl->calculation_repeats);

	/* values taken as is from the pwrtune data */
	pwrtune = pwrtune_get(ctx->dev);
	put_unaligned_be32(pwrtune->l2_lta_wnd_sz_default,
							&tbl->l2_num_min_tdp);
	tbl->lts_truncate_n = pwrtune->lts_truncate_n_default;
	tbl->shift_n = pwrtune->shift_n_default;
	tbl->cac_temp = pwrtune->operating_temp;

	tbl->pg_lkge_scale_log2 = 12;
}

/*
 * Write ctx->gpu_aux_clk / 100 in the SMC_SW_TICKS_PER_US smc software
 * register.
 *
 * NOTE(review): the division by 100 suggests gpu_aux_clk is in 10 kHz units
 * -- confirm where it is initialized.
 */
void smc_cac_cfg_tbl_sw_regs_init(struct ctx *ctx)
{
	u32 ticks_per_us;

	LOG("smc calculation accumulator software registers init");

	ticks_per_us = ctx->gpu_aux_clk / 100;

	smc_sw_wr32(ctx->dev, ticks_per_us, SMC_SW_TICKS_PER_US);
}
